Loading of R packages, setting seed for reproducability, and session info to provide version numbers.
knitr::opts_chunk$set(echo = TRUE, message = FALSE,
warning = FALSE)
rm(list = ls())
library('DESeq2')
library('ComplexHeatmap')
library('EnhancedVolcano')
library('DT')
library('fgsea')
library(ggplot2)
library(ggpubr)
library(dplyr)
#library(tidyr)
library(data.table)
library(msigdbr)
#library(Matrix)
set.seed(1234)
# set root folder
rootDir = "../"
# load shared parameters and functions
source(paste0(rootDir,"parameters_and_functions.r"))
exprsDat = read.csv(paste0(rootDir,"input_data/table_RNAseq.csv"), row.names = 1)
dim(exprsDat)
## [1] 26467 45
#=========================
# read in patient annotation and mapping
patTab = read.csv(file = paste0(rootDir,"input_data/table_patients.csv"), row.names = 1)
# switch to R/NR annotation
patTab = patTab %>% mutate(RECIST = factor(RECIST,levels = c("Responder", "NonResponder"), labels = c("R","NR")))
datatable(patTab)
# sample annotation
sampAnnot = data.frame(
patientID = paste(sapply(strsplit(colnames(exprsDat), '_'), geti,1), sapply(strsplit(colnames(exprsDat), '_'), geti,2), sep = '_'),
timepoint = sapply(strsplit(colnames(exprsDat), '_'), geti,3),
check.names = F)
rownames(sampAnnot) = colnames(exprsDat)
table(sampAnnot$timepoint)
##
## Baseline C1D1 C2D1
## 23 18 4
# find pairs
mat = printPairs(sampAnnot)
## The number of paired samples between Baseline and C1D1: 16
## The number of paired samples between Baseline and C2D1: 4
## The number of paired samples between C2D1 and C1D1: 4
We evaluate sample quality from the distribution of reads as visualized in a boxplot of log counts.
boxplot(log2(exprsDat+1), ylab='log2(counts+1)',
xlab='samples', las=2, cex.axis = 0.25,
border = ifelse(apply(log2(exprsDat+1),2,median)==0,
'red','black'), main = "Distribution of log-transformed counts")
Due to this analysis being paired, samples with matching timepoints were retained across all comparisons. Samples without matching timepoints were removed.
##BaselinevC1D1
pairedsamples <- mat %>% filter(Baseline == 1 & C1D1 == 1) %>% rownames()
# select paired samples and exclude a time point from comparison
BaselinevC1D1sampAnnotFilter <- sampAnnot %>% filter(patientID %in% pairedsamples & timepoint != "C2D1")
# subset expression data
BaselinevC1D1exprsDatFilter <- exprsDat[,rownames(BaselinevC1D1sampAnnotFilter)]
##BaselinevC2D1
pairedsamples <- mat %>% filter(Baseline == 1 & C2D1 == 1) %>% rownames()
# select paired samples and exclude a time point from comparison
BaselinevC2D1sampAnnotFilter <- sampAnnot %>% filter(patientID %in% pairedsamples & timepoint != "C1D1")
# subset expression data
BaselinevC2D1exprsDatFilter <- exprsDat[,rownames(BaselinevC2D1sampAnnotFilter)]
##C1D1vC2D1
pairedsamples <- mat %>% filter(C1D1 == 1 & C2D1 == 1) %>% rownames()
# select paired samples and exclude a time point from comparison
C1D1vC2D1sampAnnotFilter <- sampAnnot %>% filter(patientID %in% pairedsamples & timepoint != "Baseline")
# subset expression data
C1D1vC2D1exprsDatFilter <- exprsDat[,rownames(C1D1vC2D1sampAnnotFilter)]
##AllFinalSamples
sampAnnotPaired <- sampAnnot[rownames(sampAnnot) %in% c(rownames(BaselinevC1D1sampAnnotFilter),rownames(BaselinevC2D1sampAnnotFilter),rownames(C1D1vC2D1sampAnnotFilter)),]
exprsDatPaired <- exprsDat[,rownames(sampAnnotPaired)]
For the retained samples, we run differential expression analysis
across time with DESeq2.
Estimated fold changes are shrunk with ashr using
lfcShrink to account for the variation in the samples in
this dataset.
#DE
#=======================
##Baseline V Entino
# get samples from baseline and C1D1
samp <- BaselinevC1D1sampAnnotFilter[!is.na(BaselinevC1D1sampAnnotFilter$timepoint),]
sampIds <- rownames(samp)
# create a DESeq object
BaselinevC1D1dds <- DESeqDataSetFromMatrix(
countData = round(BaselinevC1D1exprsDatFilter)[,colnames(BaselinevC1D1exprsDatFilter) %in% sampIds],
colData = samp,
design=~patientID+timepoint)
BaselinevC1D1dds <- DESeq(BaselinevC1D1dds)
# get the result table
BaselinevC1D1res <- results(BaselinevC1D1dds, contrast = c("timepoint","C1D1", "Baseline"))
BaselinevC1D1res <- lfcShrink(BaselinevC1D1dds, contrast = c("timepoint","C1D1", "Baseline"), res=BaselinevC1D1res, type="ashr")
BaselinevC1D1res <- BaselinevC1D1res[!is.na(BaselinevC1D1res$padj),]
#=======================
##Baseline V C2D1
samp <- BaselinevC2D1sampAnnotFilter[!is.na(BaselinevC2D1sampAnnotFilter$timepoint),]
sampIds <- rownames(samp)
##DE by Time
BaselinevC2D1dds <- DESeqDataSetFromMatrix(countData =
round(BaselinevC2D1exprsDatFilter)[,colnames(BaselinevC2D1exprsDatFilter) %in% sampIds],
colData = samp,
design=~patientID+timepoint)
BaselinevC2D1dds <- DESeq(BaselinevC2D1dds)
BaselinevC2D1res <- results(BaselinevC2D1dds, contrast = c("timepoint","C2D1", "Baseline"))
BaselinevC2D1res <- lfcShrink(BaselinevC2D1dds, contrast = c("timepoint","C2D1", "Baseline"), res=BaselinevC2D1res, type="ashr")
BaselinevC2D1res <- BaselinevC2D1res[!is.na(BaselinevC2D1res$padj),]
#=======================
##Entino v C2D1
# DE
samp <- C1D1vC2D1sampAnnotFilter[!is.na(C1D1vC2D1sampAnnotFilter$timepoint),]
sampIds <- rownames(samp)
##DE by Time
C1D1vC2D1dds <- DESeqDataSetFromMatrix(countData =
round(C1D1vC2D1exprsDatFilter)[,colnames(C1D1vC2D1exprsDatFilter) %in% sampIds],
colData = samp,
design=~patientID+timepoint)
C1D1vC2D1dds <- DESeq(C1D1vC2D1dds)
C1D1vC2D1res <- results(C1D1vC2D1dds, contrast = c("timepoint","C2D1", "C1D1"))
C1D1vC2D1res <- lfcShrink(C1D1vC2D1dds, contrast = c("timepoint","C2D1", "C1D1"), res=C1D1vC2D1res, type="ashr")
C1D1vC2D1res <- C1D1vC2D1res[!is.na(C1D1vC2D1res$padj),]
#Extract Stats for Pathway Analysis
BaselinevC1D1stats <- BaselinevC1D1res$log2FoldChange
names(BaselinevC1D1stats) <- row.names(BaselinevC1D1res)
BaselinevC2D1stats <- BaselinevC2D1res$log2FoldChange
names(BaselinevC2D1stats) <- row.names(BaselinevC2D1res)
C1D1vC2D1stats <- C1D1vC2D1res$log2FoldChange
names(C1D1vC2D1stats) <- row.names(C1D1vC2D1res)
For each comparison, a datatable of results and volcanoplot will be provided. Plotted on the y-axis is the adjusted p-value, on the x-axis is the Log2 Fold Change.
Positive Log2 Fold Change = enriched in C1D1 samples. Negative Log2 Fold Change = enriched in Baseline samples.
Saved are going to be the genes with an absolute Log2FoldChange > 0.5 and a Padj < 0.05
##BaselinevC1D1
datatable(data.frame(BaselinevC1D1res))
#volcano plot
EnhancedVolcano(BaselinevC1D1res, lab=row.names(BaselinevC1D1res),
x='log2FoldChange', y='padj', FCcutoff = 0.5,
pCutoff = 0.05, title = "Paired DE Results by Time -- BaselinevC1D1")
# significantly DE genes
BaselinevC1D1res_TopDE <- BaselinevC1D1res[BaselinevC1D1res$padj < 0.05 &
abs(BaselinevC1D1res$log2FoldChange) > 0.5,]
#order by logFC
BaselinevC1D1res_TopDE = BaselinevC1D1res_TopDE[order(BaselinevC1D1res_TopDE$log2FoldChange, decreasing = T),]
# the total number if significantly DE genes
cat("The number of significantly DE genes:\n",
nrow(BaselinevC1D1res_TopDE))
## The number of significantly DE genes:
## 1
# count how many up- and down-regulated genes
cat("The number of down- and up-regulated genes:\n",
table(BaselinevC1D1res_TopDE$log2FoldChange >0))
## The number of down- and up-regulated genes:
## 1
write.csv(BaselinevC1D1res_TopDE, paste0(rootDir,"output_data/Baseline_vs_C1D1_TopDE.csv"))
Positive Log2 Fold Change = enriched in C2D1 samples. Negative Log2 Fold Change = enriched in Baseline samples.
Saved are going to be the genes with an absolute Log2FoldChange > 0.5 and a Padj < 0.05
##BaselinevC2D1
datatable(data.frame(BaselinevC2D1res))
# volcano
EnhancedVolcano(BaselinevC2D1res, lab=row.names(BaselinevC2D1res),
x='log2FoldChange', y='padj', FCcutoff = 0.5,
pCutoff = 0.05, title = "Paired DE Results by Time -- BaselinevC2D1")
# significantly DE genes
BaselinevC2D1res_TopDE <- BaselinevC2D1res[BaselinevC2D1res$padj < 0.05 &
abs(BaselinevC2D1res$log2FoldChange) > 0.5,]
#order by logFC
BaselinevC2D1res_TopDE = BaselinevC2D1res_TopDE[order(BaselinevC2D1res_TopDE$log2FoldChange, decreasing = T),]
# the total number if significantly DE genes
cat("The number of significantly DE genes:\n",
nrow(BaselinevC2D1res_TopDE))
## The number of significantly DE genes:
## 383
# count how many up- and down-regulated genes
cat("The number of down- and up-regulated genes:\n",
table(BaselinevC2D1res_TopDE$log2FoldChange >0))
## The number of down- and up-regulated genes:
## 121 262
write.csv(BaselinevC2D1res_TopDE, paste0(rootDir,"output_data/Baseline_vs_C2D1_TopDE.csv"))
Positive Log2 Fold Change = enriched in C2D1 samples. Negative Log2 Fold Change = enriched in C1D1 samples.
Saved are going to be the genes with an absolute Log2FoldChange > 0.5 and a Padj < 0.05
##C1D1vC2D1
datatable(data.frame(C1D1vC2D1res))
# volcano plot
EnhancedVolcano(C1D1vC2D1res, lab=row.names(C1D1vC2D1res),
x='log2FoldChange', y='padj', FCcutoff = 0.5,
pCutoffCol = 'padj', pCutoff = 0.05, title = "Paired DE Results by Time -- C1D1vC2D1")
# significantly DE genes
C1D1vC2D1res_TopDE <- C1D1vC2D1res[C1D1vC2D1res$padj < 0.05 &
abs(C1D1vC2D1res$log2FoldChange) > 0.5,]
#order by logFC
C1D1vC2D1res_TopDE = C1D1vC2D1res_TopDE[order(C1D1vC2D1res_TopDE$log2FoldChange, decreasing = T),]
# the total number if significantly DE genes
cat("The number of significantly DE genes:\n",
nrow(C1D1vC2D1res_TopDE))
## The number of significantly DE genes:
## 346
# count how many up- and down-regulated genes
cat("The number of down- and up-regulated genes:\n",
table(C1D1vC2D1res_TopDE$log2FoldChange >0))
## The number of down- and up-regulated genes:
## 46 300
write.csv(C1D1vC2D1res_TopDE, paste0(rootDir,"output_data/C1D1_vs_C2D1_TopDE.csv"))
Gene set statistics are run with fgsea with Hallmark pathways from MSigDb.
Positive Normalized Enrichment Score = enriched in C1D1 samples. Negative Normalized Enrichment Score = enriched in Baseline samples.
# Get Pathways
hallmark_df = msigdbr(species = "Homo sapiens", category = "H")
hallmark_list = hallmark_df %>% split(x = .$gene_symbol, f = .$gs_name)
# run fgsea
BaselinevC1D1gsResults <- as.data.frame(fgsea(pathways=hallmark_list, stats=BaselinevC1D1stats))
#BaselinevC1D1
datatable(BaselinevC1D1gsResults)
fwrite(BaselinevC1D1gsResults, file=paste0(rootDir,"output_data/BaselinevC1D1_HALLMARK.csv"))
Positive Normalized Enrichment Score = enriched in C2D1 samples. Negative Normalized Enrichment Score = enriched in Baseline samples.
#Complete Pathway Analysis
BaselinevC2D1gsResults <- as.data.frame(fgsea(pathways=hallmark_list, stats=BaselinevC2D1stats))
#BaselinevC2D1
datatable(BaselinevC2D1gsResults)
fwrite(BaselinevC2D1gsResults, file=paste0(rootDir,"output_data/BaselinevC2D1_HALLMARK.csv"))
Positive Normalized Enrichment Score = enriched in C2D1 samples. Negative Normalized Enrichment Score = enriched in C1D1 samples.
C1D1vC2D1gsResults <- as.data.frame(fgsea(pathways=hallmark_list, stats=C1D1vC2D1stats))
#C1D1vC2D1
datatable(C1D1vC2D1gsResults)
fwrite(C1D1vC2D1gsResults, file=paste0(rootDir,"output_data/C1D1vC2D1_HALLMARK.csv"))
Create a dotplot with GSEA results for different comparisons. Plot significant pathways only (adjusted p-values < 0.05). Color is the normalized enrichment score. Red/blue means higher/lower expression on the second time point.
# merge pathwyay analysis results for plotting
pathRes = rbind(cbind(comparison = rep("Baseline_vs_C1D1",nrow(BaselinevC1D1gsResults)),BaselinevC1D1gsResults),
cbind(comparison = rep("Baseline_vs_C2D1",nrow(BaselinevC2D1gsResults)),BaselinevC2D1gsResults),
cbind(comparison = rep("C1D1_vs_C2D1",nrow(C1D1vC2D1gsResults)),C1D1vC2D1gsResults))
# make NES for not sugnificant pathways as NA
pathRes_ns = pathRes %>% mutate(NES, NES = ifelse(padj >0.05, NA, NES))
# plot: dot plot
p = ggplot(data = pathRes_ns %>% filter(!is.na(NES)), aes(x = comparison, y = pathway, color = NES)) +
geom_point(size = 4) +
# scale_color_gradient(low = "red", high = "blue") +
scale_colour_gradient2(low = "blue", high = "red") +
scale_x_discrete(labels = c("Baseline vs C1D1", "Baseline vs C2D1","C1D1 vs C2D1")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5))
print(p)
sessionInfo()
## R version 4.3.2 (2023-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] pheatmap_1.0.12 scales_1.3.0
## [3] Matrix_1.6-3 data.table_1.14.8
## [5] BiocManager_1.30.22 survival_3.5-7
## [7] binom_1.1-1.1 flextable_0.9.4
## [9] survminer_0.4.9 msigdbr_7.5.1
## [11] fgsea_1.26.0 EnhancedVolcano_1.18.0
## [13] ComplexHeatmap_2.16.0 DESeq2_1.40.2
## [15] SummarizedExperiment_1.30.2 Biobase_2.60.0
## [17] MatrixGenerics_1.12.3 matrixStats_1.1.0
## [19] GenomicRanges_1.52.1 GenomeInfoDb_1.36.4
## [21] IRanges_2.34.1 S4Vectors_0.38.2
## [23] BiocGenerics_0.46.0 kableExtra_1.3.4
## [25] pals_1.8 gridExtra_2.3
## [27] DT_0.30 ggrepel_0.9.4
## [29] ggpubr_0.6.0 ggplot2_3.4.4
## [31] dplyr_1.1.2 knitr_1.45
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.2 later_1.3.1 bitops_1.0-7
## [4] tibble_3.2.1 lifecycle_1.0.4 rstatix_0.7.2
## [7] mixsqp_0.3-54 doParallel_1.0.17 lattice_0.21-9
## [10] crosstalk_1.2.1 backports_1.4.1 magrittr_2.0.3
## [13] sass_0.4.8 rmarkdown_2.25 jquerylib_0.1.4
## [16] yaml_2.3.7 httpuv_1.6.12 zip_2.3.0
## [19] askpass_1.2.0 cowplot_1.1.2 mapproj_1.2.11
## [22] RColorBrewer_1.1-3 maps_3.4.1.1 abind_1.4-5
## [25] zlibbioc_1.46.0 rvest_1.0.3 purrr_1.0.2
## [28] RCurl_1.98-1.13 gdtools_0.3.4 circlize_0.4.15
## [31] GenomeInfoDbData_1.2.10 KMsurv_0.1-5 irlba_2.3.5.1
## [34] crul_1.4.0 svglite_2.1.2 commonmark_1.9.0
## [37] codetools_0.2-19 DelayedArray_0.26.7 ggtext_0.1.2
## [40] xml2_1.3.5 tidyselect_1.2.0 shape_1.4.6
## [43] httpcode_0.3.0 farver_2.1.1 webshot_0.5.5
## [46] jsonlite_1.8.7 GetoptLong_1.0.5 ellipsis_0.3.2
## [49] iterators_1.0.14 systemfonts_1.0.5 foreach_1.5.2
## [52] tools_4.3.2 ragg_1.2.6 snow_0.4-4
## [55] Rcpp_1.0.11 glue_1.6.2 xfun_0.39
## [58] withr_2.5.2 fastmap_1.1.1 fansi_1.0.4
## [61] openssl_2.1.1 digest_0.6.33 truncnorm_1.0-9
## [64] R6_2.5.1 mime_0.12 textshaping_0.3.7
## [67] colorspace_2.1-0 dichromat_2.0-0.1 markdown_1.12
## [70] utf8_1.2.3 tidyr_1.3.0 generics_0.1.3
## [73] fontLiberation_0.1.0 httr_1.4.7 htmlwidgets_1.6.4
## [76] S4Arrays_1.0.6 pkgconfig_2.0.3 gtable_0.3.4
## [79] XVector_0.40.0 survMisc_0.5.6 htmltools_0.5.7
## [82] fontBitstreamVera_0.1.1 carData_3.0-5 clue_0.3-65
## [85] png_0.1-8 ashr_2.2-63 km.ci_0.5-6
## [88] rstudioapi_0.15.0 reshape2_1.4.4 rjson_0.2.21
## [91] uuid_1.1-1 curl_5.1.0 zoo_1.8-12
## [94] cachem_1.0.8 GlobalOptions_0.1.2 stringr_1.5.1
## [97] parallel_4.3.2 pillar_1.9.0 vctrs_0.6.3
## [100] promises_1.2.1 car_3.1-2 xtable_1.8-4
## [103] cluster_2.1.4 evaluate_0.23 invgamma_1.1
## [106] cli_3.6.1 locfit_1.5-9.8 compiler_4.3.2
## [109] rlang_1.1.1 crayon_1.5.2 SQUAREM_2021.1
## [112] ggsignif_0.6.4 labeling_0.4.3 plyr_1.8.9
## [115] stringi_1.8.1 viridisLite_0.4.2 BiocParallel_1.34.2
## [118] babelgene_22.9 munsell_0.5.0 fontquiver_0.2.1
## [121] gfonts_0.2.0 shiny_1.8.0 highr_0.10
## [124] gridtext_0.1.5 broom_1.0.5 bslib_0.6.1
## [127] fastmatch_1.1-4 officer_0.6.3